
# Checking to make sure I calculated things correctly 
region_gen_dt[fuel_category %in% c('coal','petroleum'),.(
  nerc_adj, 
  fuel_category,
  n_plants = num_plants, 
  net_gen_twh = round(tot_net_generation_mwh/1e5, 2),
  nox_emissions_lb_mwh = round(nox_lb/tot_net_generation_mwh, 2),
  so2_emissions_lb_mwh = round(so2_lb/tot_net_generation_mwh, 2),
  co2e_emissions_lb_mwh = round(co2e_lb/tot_net_generation_mwh, 2)
)][order(fuel_category)]
# Happening in particular in CAL, NPCC


# Reading data for all of the petroleum plants 
petroleum_plant_gen_fp = 
  plant_gen_fp[
    str_extract(plant_gen_fp, '(?<=-)\\d*(?=\\.fst)') |> as.integer()
    %in% plant_info_dt[fuel_category == 'petroleum']$plant_id_eia
  ]
petroleum_elec_gen_dt = 
  map_dfr(
    petroleum_plant_gen_fp, 
    \(fp_in){
      read.fst(
        path = fp_in,
        as.data.table = TRUE
      ) |>
      merge(
        plant_info_dt, 
        by = 'plant_id_eia'
      )
    }
  )[,.(
    tot_net_generation_mwh = sum(net_generation_mwh),
    tot_net_generation_mwh_raw = sum(net_generation_mwh_raw),
    tot_fit_net_generation_mwh = sum(fit_net_generation_mwh),
    nox_lb = sum(nox_mass_lb_for_electricity),
    so2_lb = sum(so2_mass_lb_for_electricity),
    co2e_lb = sum(co2e_mass_lb_for_electricity),
    num_hours = .N
  ), 
    keyby = .(
      plant_id_eia, 
      nerc_adj
    )
  ]
petroleum_elec_gen_dt[
  plant_id_eia != '50626',.(
  plant_id_eia,
  nox_emissions_lb_mwh_raw = round(nox_lb/tot_net_generation_mwh_raw, 2),
  so2_emissions_lb_mwh_raw = round(so2_lb/tot_net_generation_mwh_raw, 2),
  co2e_emissions_lb_mwh_raw = round(co2e_lb/tot_net_generation_mwh_raw, 2), 
  so2_lb
)][order(-so2_emissions_lb_mwh_raw)]  |>
print(n = 10000)

petroleum_elec_gen_dt[so2_lb/tot_net_generation_mwh_raw > 100]


p_load(ggplot2)

plant_info_dt[plant_id_eia == '50626']

ex_plant_dt = 
  read.fst(
    path = petroleum_plant_gen_fp |> str_subset('50626'), 
    as.data.table= TRUE
  )

ggplot(ex_plant_dt, aes(x = co2e_mass_lb_for_electricity)) + 
geom_density()

ggplot(ex_plant_dt, aes(x = net_generation_mwh, y = so2_mass_lb_for_electricity)) + 
geom_point(alpha = 0.05)
ggplot(ex_plant_dt, aes(x = net_generation_mwh, y = co2e_mass_lb_for_electricity)) + 
geom_point(alpha = 0.05)

plant_info_dt[plant_id_eia == '2528']
# Checking raw OGE data 
oge_raw = fread(
  here('Data/electricity-generation/open-grid-emissions/v0.2.0/2019_plant_data_hourly_us_units/NYIS.csv')
)

ggplot(oge_raw[plant_id_eia == '2528'], aes(x = so2_mass_lb_for_electricity)) + 
geom_density() + 

ggplot(oge_raw[plant_id_eia == '8023'], aes(x = net_generation_mwh, y = co2_mass_lb_for_electricity_adjusted)) + 
geom_point(alpha = 0.05)

# Checking udpated OGE data
oge_raw_v3 = fread(
  here('Data/electricity-generation/open-grid-emissions/v0.3.0/2019_plant_data_hourly_us_units/NYIS.csv')
)

ggplot(oge_raw_v3[plant_id_eia == '50626' & co2_mass_lb_for_electricity_adjusted/net_generation_mwh < 20000], aes(x = co2_mass_lb_for_electricity_adjusted/net_generation_mwh)) + 
geom_histogram(bins = 1000)

p_load(collapse)
p99 = fnth(oge_raw_v3[plant_id_eia == '50626']$net_generation_mwh, n = .9941)
so2_censor = fnth(oge_raw_v3[plant_id_eia == '50626']$so2_mass_lb_for_electricity, n = .9941)

plant_info_dt[plant_id_eia == '50626']

ggplot(
  oge_raw_v3[plant_id_eia == '50626'], 
  aes(x = net_generation_mwh, y = so2_mass_lb_for_electricity/2000)) + 
geom_point(alpha = 0.25) + 
geom_vline(xintercept = 5.7, linetype = 'dashed') + 
theme_minimal() + 
labs(
  x = 'Net Generation (MWh)', 
  y = 'SO2 Emissions (tons)'
) + 
xlim(c(
  0, 
  max(oge_raw_v3[plant_id_eia == '50626']$net_generation_mwh)
)) + 
ylim(c(
  0, 
  max(oge_raw_v3[plant_id_eia == '50626']$so2_mass_lb_for_electricity)/2000
)) 

ggplot(
  ex_plant_dt[plant_id_eia == '50626'], 
  aes(x = net_generation_mwh, y = so2_mass_lb_for_electricity/2000)) + 
geom_point(alpha = 0.25) + 
geom_point(aes(x = net_generation_mwh_raw), color = 'red')+  
geom_vline(xintercept = 5.7, linetype = 'dashed') + 
theme_minimal() + 
labs(
  x = 'Net Generation (MWh)', 
  y = 'SO2 Emissions (tons)'
)+ 
xlim(c(
  0, 
  max(oge_raw_v3[plant_id_eia == '50626']$net_generation_mwh)
)) + 
ylim(c(
  0, 
  max(oge_raw_v3[plant_id_eia == '50626']$so2_mass_lb_for_electricity)/2000
)) 

ggplot(
  ex_plant_dt[plant_id_eia == '50626',.(
    net_generation_mwh, 
    so2_censored = fcase(
      so2_mass_lb_for_electricity >= so2_censor, so2_censor, 
      so2_mass_lb_for_electricity < so2_censor, so2_mass_lb_for_electricity
    )
  )], 
  aes(x = net_generation_mwh, y = so2_censored/2000)) + 
geom_point(alpha = 0.25) + 
geom_vline(xintercept = 5.7, linetype = 'dashed') + 
theme_minimal() + 
labs(
  x = 'Net Generation (MWh)', 
  y = 'SO2 Emissions (tons)'
)+ 
xlim(c(
  0, 
  max(oge_raw_v3[plant_id_eia == '50626']$net_generation_mwh)
)) + 
ylim(c(
  0, 
  max(oge_raw_v3[plant_id_eia == '50626']$so2_mass_lb_for_electricity)/2000
)) 
# Lets see what fuel category averages are in old and new data 
merge(
  oge_raw_v3, 
  plant_info_dt, 
  by = 'plant_id_eia'
)[,.(
  tot_net_generation_mwh = sum(net_generation_mwh),
  nox_lb_mwh = sum(nox_mass_lb_for_electricity)/sum(net_generation_mwh),
  so2_lb_mwh = sum(so2_mass_lb_for_electricity)/sum(net_generation_mwh),
  co2e_lb_mwh = sum(co2_mass_lb_for_electricity_adjusted)/sum(net_generation_mwh),
  num_hours = .N), 
  keyby = .(fuel_category)
]

merge(
  oge_raw_v3, 
  plant_info_dt, 
  by = 'plant_id_eia'
)[,.(
  tot_net_generation_mwh = sum(net_generation_mwh),
  nox_lb_mwh = sum(nox_mass_lb_for_electricity)/sum(net_generation_mwh),
  so2_lb_mwh = sum(so2_mass_lb_for_electricity)/sum(net_generation_mwh),
  co2e_lb_mwh = sum(co2_mass_lb_for_electricity_adjusted)/sum(net_generation_mwh),
  num_hours = .N), 
  keyby = .(fuel_category)
]

plant_info_dt[plant_id_eia == '50626']

p_load(readxl, janitor)
egrid_2019 = 
  read_xlsx(
    here('Data/electricity-generation/egrid/egrid2019_data.xlsx'), 
    sheet = 'PLNT19', 
    skip = 1
  ) |>
  data.table()
egrid_2019[ORISPL == '2528',.(
  source = 'egrid',
  net_generation_mwh = PLNGENAN, 
  nox_lbs = 2000*PLNOXAN, 
  so2_lbs = 2000*PLSO2AN, 
  co2_lbs = 2000*PLCO2AN, 
  nox_lbs_mwh = 2000*PLNOXAN/PLNGENAN, 
  so2_lbs_mwh = 2000*PLSO2AN/PLNGENAN ,
  co2_lbs_mwh = 2000*PLCO2AN/PLNGENAN
 )] |>
 rbind(
  oge_raw_v3[plant_id_eia == '2528',.(
  source = 'oge',
  net_generation_mwh = sum(net_generation_mwh), 
  nox_lbs = sum(nox_mass_lb_for_electricity),
  so2_lbs = sum(so2_mass_lb_for_electricity),
  co2_lbs = sum(co2_mass_lb_for_electricity),
  nox_lbs_mwh = sum(nox_mass_lb_for_electricity)/sum(net_generation_mwh), 
  so2_lbs_mwh = sum(so2_mass_lb_for_electricity)/sum(net_generation_mwh), 
  co2_lbs_mwh = sum(co2_mass_lb_for_electricity)/sum(net_generation_mwh)
 )]
 )

# Reading elec gen data 
elec_gen_dt_cal = 
  read.fst(
      path = here(
        "Data/electricity-generation/elec-gen-dt",
        "elec-gen-dt-cal.fst"
      ), 
      as.data.table = TRUE
    )

tmp = elec_gen_dt_cal[,.(
  tot_mwh = sum(net_generation_mwh_raw), 
  tot_so2 = sum(so2_mass_lb_for_electricity), 
  tot_nox = sum(nox_mass_lb_for_electricity), 
  tot_co2e = sum(co2e_mass_lb_for_electricity)
), keyby = .(fuel_category, plant_id_eia)
]
tmp[order(tot_so2/tot_mwh) & fuel_category == 'petroleum']



# So can we look at average emissions for outliers? 
summarize_plant_gen_dt_plant = function(nerc_adj_in){
  # Getting file paths for all plants in region
  nerc_plant_gen_fp = 
    plant_gen_fp[
      str_extract(plant_gen_fp, '(?<=-)\\d*(?=\\.fst)') |> as.integer()
      %in% plant_info_dt[nerc_adj == nerc_adj_in]$plant_id_eia
    ]
  # Summarizing generation by time, ic, nerc, fuel cat
  elec_gen_dt = 
    map_dfr(
      nerc_plant_gen_fp, 
      \(fp_in){
        read.fst(
          path = fp_in,
          as.data.table = TRUE
        ) |>
        merge(
          plant_info_dt, 
          by = 'plant_id_eia'
        )
      }
    )[,.(
      tot_net_generation_mwh = sum(net_generation_mwh_raw),
      tot_fit_net_generation_mwh = sum(fit_net_generation_mwh),
      nox_lb = sum(nox_mass_lb_for_electricity),
      so2_lb = sum(so2_mass_lb_for_electricity),
      co2e_lb = sum(co2e_mass_lb_for_electricity),
      num_hours = .N,
      num_plants = uniqueN(plant_id_eia)
    ), 
      keyby = .(
        nerc_adj, 
        fuel_category, 
        plant_id_eia
      )
    ]
  return(elec_gen_dt)
}

plant_summary_dt =     
  map_dfr(
    c('CAL','MRO','NPCC','RFC','SERC','TRE','WECC'),
    summarize_plant_gen_dt_plant
  )
plant_summary_dt[,":="(
  nox = nox_lb/tot_net_generation_mwh,
  so2 = so2_lb/tot_net_generation_mwh,
  co2e = co2e_lb/tot_net_generation_mwh
)] 
plant_summary_dt_long = melt(
  plant_summary_dt, 
  id.vars = c('nerc_adj','fuel_category','plant_id_eia'), 
  measure.vars = c('nox','so2','co2e')
)
merge(
  plant_summary_dt_long, 
  plant_summary_dt_long[,
    .(q99 = quantile(value, probs = 0.99)), 
    keyby = .(fuel_category, variable)
  ], 
  by = c('fuel_category','variable')
) |>
ggplot(aes(x=value, color = value > q99, fill = value > q99))+
geom_histogram(bins = 100) + 
facet_wrap(
  #rows = vars(fuel_category), 
  #cols = vars(variable), 
  ~fuel_category + variable,
  scales = 'free'
) + 
#scale_x_log10() +
theme_minimal()

# Lets look at the petroleum plants 
questionable_list = 
  merge(
    plant_summary_dt_long, 
    plant_summary_dt_long[,
      .(q99 = quantile(value, probs = 0.99)), 
      keyby = .(fuel_category, variable)
    ], 
    by = c('fuel_category','variable')
  )[value > q99]


plant_summary_dt[
  plant_id_eia %in% unique(questionable_list$plant_id_eia)
  ,.(
    nerc_adj, plant_id_eia, 
    nox = round(nox, 2), 
    so2 = round(so2, 2), 
    co2e = round(co2e, 2) 
  ),
  keyby = .(fuel_category, -tot_net_generation_mwh)
]

# Exclusion list 
naughty_list = 
  plant_summary_dt[
    plant_id_eia %in% unique(questionable_list$plant_id_eia) & 
    (
      (fuel_category == 'petroleum') |
      (fuel_category == 'natural_gas' & (
        nox > 1999 | so2 > 1000 | co2e > 100000
      )) 
    )|(plant_id_eia %in% c(50931))
  ] |>
  merge(
    egrid_2019[,.(
      plant_id_eia = ORISPL, 
      name_plant = PNAME, 
      name_operator = OPRNAME,
      egrid_net_generation_mwh = PLNGENAN, 
      egrid_nox_lbs = 2000*PLNOXAN, 
      egrid_so2_lbs = 2000*PLSO2AN, 
      egrid_co2_lbs = 2000*PLCO2AN, 
      egrid_nox_lbs_mwh = 2000*PLNOXAN/PLNGENAN, 
      egrid_so2_lbs_mwh = 2000*PLSO2AN/PLNGENAN ,
      egrid_co2_lbs_mwh = 2000*PLCO2AN/PLNGENAN
    )], 
    by = 'plant_id_eia', 
    all.x = TRUE
  )

naughty_list[,.(
  fuel_category,
  plant_id_eia, 
  oge_net_generation_mwh = round(tot_net_generation_mwh, 1),
  egrid_net_generation_mwh = round(egrid_net_generation_mwh,1),
  oge_nox_lbs_mwh = round(nox, 1), 
  egrid_nox_lbs_mwh = round(egrid_nox_lbs_mwh,1),
  oge_so2_lbs_mwh = round(so2, 1), 
  egrid_so2_lbs_mwh = round(egrid_so2_lbs_mwh,1),
  oge_co2e_lbs_mwh = round(co2e, 1), 
  egrid_co2_lbs_mwh = round(egrid_co2_lbs_mwh,1),
  oge_nox_lbs_egrid_mwh = round(nox_lb/egrid_net_generation_mwh, 1),
  oge_so2_lbs_egrid_mwh = round(so2_lb/egrid_net_generation_mwh, 1),
  oge_co2e_lbs_egrid_mwh = round(co2e_lb/egrid_net_generation_mwh, 1)
)]
# So...
# 2221, 2528 seem off in both datasets 
# 2503, 50626 considerably lower emissions in eGRID, similar mwh
# 2528
# Others are because mwh underreported in OGE
naughty_list[plant_id_eia %in% c(2221, 2528, 2503, 50626)]

merge(
  plant_summary_dt_long[fuel_category %in% c('natural_gas','petroleum')], 
  plant_summary_dt_long[,
    .(q99 = quantile(value, probs = 0.99)), 
    keyby = .(fuel_category, variable)
  ], 
  by = c('fuel_category','variable')
) |>
ggplot(aes(
  x=value, 
  color = plant_id_eia %in% c(2221, 2528, 2503, 50626, 50931), 
  fill = plant_id_eia %in% c(2221, 2528, 2503, 50626, 50931)
))+
geom_histogram(bins = 100) + 
facet_wrap(
  #rows = vars(fuel_category), 
  #cols = vars(variable), 
  ~fuel_category + variable,
  scales = 'free'
) + 
#scale_x_log10() +
labs(color = 'Bad data', fill = 'Bad data') + 
theme_minimal()


plant_summary_dt_long[
  fuel_category %in% c('natural_gas','petroleum') & 
  !(plant_id_eia %in% naughty_list$plant_id_eia)
]|>
ggplot(aes(x=value))+
geom_histogram(bins = 100) + 
facet_wrap(
  #rows = vars(fuel_category), 
  #cols = vars(variable), 
  ~fuel_category + variable,
  scales = 'free'
) + 
#scale_x_log10() +
theme_minimal()


# SO2 still high 
merge(
  plant_summary_dt_long, 
  plant_summary_dt_long[,
    .(q99 = quantile(value, probs = 0.99)), 
    keyby = .(fuel_category, variable)
  ], 
  by = c('fuel_category','variable')
)[fuel_category == 'petroleum' & variable == 'so2' & value > q99]

plant_summary_dt[
  fuel_category == 'petroleum' & 
  so2 > 39 & 
  !(plant_id_eia %in% c(2221, 2528, 2503, 50626))
  ,.(
    plant_id_eia, 
    nerc_adj, 
    mwh = tot_net_generation_mwh, 
    nox_lb = round(nox_lb,1), 
    so2_lb = round(so2_lb,1), 
    co2e_lb = round(co2e_lb,1), 
    nox_lb_mwh = round(nox,1), 
    so2_lb_mwh = round(so2,1), 
    co2e_lb_mwh = round(co2e,1)
  )
][order(-so2_lb)]

# What if we merged egrid w/oge to find big differences 
check_oge_egrid_dt = 
  merge(
    plant_summary_dt,
    egrid_2019[,.(
      plant_id_eia = ORISPL, 
      name_plant = PNAME, 
      name_operator = OPRNAME,
      egrid_net_generation_mwh = PLNGENAN, 
      egrid_nox_lb = 2000*PLNOXAN, 
      egrid_so2_lb = 2000*PLSO2AN, 
      egrid_co2_lb = 2000*PLCO2AN, 
      egrid_nox_lb_mwh = 2000*PLNOXAN/PLNGENAN, 
      egrid_so2_lb_mwh = 2000*PLSO2AN/PLNGENAN ,
      egrid_co2_lb_mwh = 2000*PLCO2AN/PLNGENAN
    )], 
    by = 'plant_id_eia', 
    all.x = TRUE
  )
  
ggplot(
  check_oge_egrid_dt, 
  aes(x = nox - egrid_nox_lb_mwh)
) +
geom_density()


  check_oge_egrid_dt[
    abs(so2_lb/egrid_net_generation_mwh - egrid_so2_lb_mwh) > 40
    #plant_id_eia == 50931
    ,.(
    fuel_category,
    plant_id_eia, 
    oge_net_generation_mwh = tot_net_generation_mwh,
    egrid_net_generation_mwh = egrid_net_generation_mwh,
    oge_nox_lb_mwh = nox, 
    egrid_nox_lb_mwh = egrid_nox_lb_mwh,
    oge_so2_lb_mwh = so2, 
    egrid_so2_lb_mwh = egrid_so2_lb_mwh,
    oge_co2e_lb_mwh = co2e, 
    egrid_co2_lb_mwh = egrid_co2_lb_mwh,
    oge_nox_lb_egrid_mwh = nox_lb/egrid_net_generation_mwh,
    oge_so2_lb_egrid_mwh = so2_lb/egrid_net_generation_mwh,
    oge_co2e_lb_egrid_mwh = co2e_lb/egrid_net_generation_mwh
  )]


quantile(
  check_oge_egrid_dt[,.(abs(so2_lb/egrid_net_generation_mwh - egrid_so2_lb_mwh))]$V1, 
  probs = seq(0.99,1, by = 0.001),
  na.rm = TRUE
)
quantile(
  check_oge_egrid_dt[,.(abs(so2 - egrid_so2_lb_mwh))]$V1, 
  probs = seq(0.99,1, by = 0.001),
  na.rm = TRUE
)

# Ok so lets figure out why OGE assigned weird values to these plants 
check_oge_egrid_dt[
  plant_id_eia %in% c(2221, 2528, 2503, 50626, 50931),.(
    fuel_category,
    plant_id_eia, 
    oge_net_generation_mwh = round(tot_net_generation_mwh,1),
    egrid_net_generation_mwh = round(egrid_net_generation_mwh,1),
    oge_nox_lb_mwh = round(nox, 1),
    egrid_nox_lb_mwh = round(egrid_nox_lb_mwh,1),
    oge_so2_lb_mwh = round(so2, 1),
    egrid_so2_lb_mwh = round(egrid_so2_lb_mwh,1),
    oge_co2e_lb_mwh = round(co2e, 1),
    egrid_co2_lb_mwh = round(egrid_co2_lb_mwh,1),
    oge_nox_lb_egrid_mwh = round(nox_lb/egrid_net_generation_mwh,1),
    oge_so2_lb_egrid_mwh = round(so2_lb/egrid_net_generation_mwh,1),
    oge_co2e_lb_egrid_mwh = round(co2e_lb/egrid_net_generation_mwh,1)
)]



cems_2019 = read.fst(
  here('Data/electricity-generation/cems/raw_cems_2019.fst'), 
  as.data.table = TRUE
)[orispl_code %in% c(2221, 2528, 2503, 50626, 50931)]
p_load(collapse)
cems_2019[,.(
  tot_gload_mw = fsum(gload_mw), 
  so2_lbs = fsum(so2_mass_lbs), 
  nox_lbs = fsum(nox_mass_lbs), 
  co2_lbs = 2000*fsum(co2_mass_tons)
),keyby = orispl_code]
# CEMS only has 2503, and only reports NOx, also high there. 